# Description ------------------------------------------------------------------

## This document creates market level outcome variables at baseline and endline

# Packages ---------------------------------------------------------------------
# If a package is not installed, install it first with
# install.packages("package_name"); the package name must be in quotes.
library(dplyr)
library(readr)
library(stringr)
library(haven)
library(labelled)
library(hms)

# Data -------------------------------------------------------------------------

## Endline vendor data
## (expected to provide `vendor_end`, which the endline measures are built from)
load(file = "data/2_clean/vendor_end.Rdata")

## Baseline vendor data
## (expected to provide `vendor_base`, which the baseline measures are built from)
load(file = "data/2_clean/vendor_base.Rdata")

# Make Baseline Measures -------------------------------------------------------
# Collapse the baseline vendor data to one row per market. For every outcome
# the summary holds three kinds of statistic:
#   c_<var>  : number of vendors in the market with a non-missing value
#   sd_<var> : standard deviation across those vendors
#   <var>    : market mean (reuses the vendor-level column name)
# Ordering matters: summarise() lets later expressions see summary columns
# created earlier in the same call, so every c_*/sd_* statistic must be
# computed BEFORE the mean that overwrites the vendor-level column.
market_lvl_bl <- vendor_base %>%
  group_by(market) %>%
  summarise(
    # Counts of non-missing vendor responses
    c_fee1_full = sum(!is.na(fee1_full)),
    c_fee1_part = sum(!is.na(fee1_part)),
    c_fee1_none = sum(!is.na(fee1_none)),
    c_fee2_always = sum(!is.na(fee2_always)),
    c_fee2_never = sum(!is.na(fee2_never)),
    c_fee2_sometimes = sum(!is.na(fee2_sometimes)),
    c_recent_receipt = sum(!is.na(recent_receipt_7)),
    # Within-market standard deviations
    sd_fee1_full = sd(fee1_full, na.rm = TRUE),
    sd_fee1_part = sd(fee1_part, na.rm = TRUE),
    sd_fee1_none = sd(fee1_none, na.rm = TRUE),
    sd_fee2_always = sd(fee2_always, na.rm = TRUE),
    sd_fee2_sometimes = sd(fee2_sometimes, na.rm = TRUE),
    # Fixed: previously computed from fee2_sometimes (copy-paste slip)
    sd_fee2_never = sd(fee2_never, na.rm = TRUE),
    # Market means
    fee1_full = mean(fee1_full, na.rm = TRUE),
    fee1_part = mean(fee1_part, na.rm = TRUE),
    fee1_none = mean(fee1_none, na.rm = TRUE),
    fee2_always = mean(fee2_always, na.rm = TRUE),
    fee2_sometimes = mean(fee2_sometimes, na.rm = TRUE),
    fee2_never = mean(fee2_never, na.rm = TRUE),
    recent_receipt_7 = mean(recent_receipt_7, na.rm = TRUE),
    recent_receipt_10 = mean(recent_receipt_10, na.rm = TRUE),
    recent_receipt_7_ofcl = mean(recent_receipt_7_ofcl, na.rm = TRUE),
    recent_receipt_7_v2a = mean(recent_receipt_7_v2a, na.rm = TRUE),
    recent_receipt_7_v2b = mean(recent_receipt_7_v2b, na.rm = TRUE),
    recent_receipt_10_ofcl = mean(recent_receipt_10_ofcl, na.rm = TRUE),
    recent_receipt_10_v2a = mean(recent_receipt_10_v2a, na.rm = TRUE),
    recent_receipt_10_v2b = mean(recent_receipt_10_v2b, na.rm = TRUE),
    # na.rm added so a single missing response no longer turns the whole
    # market mean into NA; this matches the endline summary.
    receipt_shown = mean(receipt_shown, na.rm = TRUE)
  )

# Make Endline Measures --------------------------------------------------------
# Collapse the endline vendor data to one row per market. For every outcome
# (including the endline-only *_0s variants) the summary holds:
#   c_<var>  : number of vendors in the market with a non-missing value
#   sd_<var> : standard deviation across those vendors
#   <var>    : market mean (reuses the vendor-level column name)
# Ordering matters: summarise() lets later expressions see summary columns
# created earlier in the same call, so every c_*/sd_* statistic must be
# computed BEFORE the mean that overwrites the vendor-level column.
market_lvl_el <- vendor_end %>%
  group_by(market) %>%
  summarise(
    # Counts of non-missing vendor responses
    c_fee1_full = sum(!is.na(fee1_full)),
    c_fee1_part = sum(!is.na(fee1_part)),
    c_fee1_none = sum(!is.na(fee1_none)),
    c_fee2_always = sum(!is.na(fee2_always)),
    c_fee2_never = sum(!is.na(fee2_never)),
    c_fee2_sometimes = sum(!is.na(fee2_sometimes)),
    c_fee1_full_0s = sum(!is.na(fee1_full_0s)),
    c_fee1_part_0s = sum(!is.na(fee1_part_0s)),
    c_fee1_none_0s = sum(!is.na(fee1_none_0s)),
    c_fee2_always_0s = sum(!is.na(fee2_always_0s)),
    c_fee2_never_0s = sum(!is.na(fee2_never_0s)),
    c_fee2_sometimes_0s = sum(!is.na(fee2_sometimes_0s)),
    c_recent_receipt = sum(!is.na(recent_receipt_7)),
    # Within-market standard deviations
    sd_fee1_full = sd(fee1_full, na.rm = TRUE),
    sd_fee1_part = sd(fee1_part, na.rm = TRUE),
    sd_fee1_none = sd(fee1_none, na.rm = TRUE),
    sd_fee2_always = sd(fee2_always, na.rm = TRUE),
    sd_fee2_sometimes = sd(fee2_sometimes, na.rm = TRUE),
    # Fixed: previously computed from fee2_sometimes (copy-paste slip)
    sd_fee2_never = sd(fee2_never, na.rm = TRUE),
    sd_fee1_full_0s = sd(fee1_full_0s, na.rm = TRUE),
    sd_fee1_part_0s = sd(fee1_part_0s, na.rm = TRUE),
    sd_fee1_none_0s = sd(fee1_none_0s, na.rm = TRUE),
    sd_fee2_always_0s = sd(fee2_always_0s, na.rm = TRUE),
    sd_fee2_sometimes_0s = sd(fee2_sometimes_0s, na.rm = TRUE),
    # Fixed: previously computed from fee2_sometimes_0s (copy-paste slip)
    sd_fee2_never_0s = sd(fee2_never_0s, na.rm = TRUE),
    # Market means
    fee1_full = mean(fee1_full, na.rm = TRUE),
    fee1_part = mean(fee1_part, na.rm = TRUE),
    fee1_none = mean(fee1_none, na.rm = TRUE),
    fee2_always = mean(fee2_always, na.rm = TRUE),
    fee2_sometimes = mean(fee2_sometimes, na.rm = TRUE),
    fee2_never = mean(fee2_never, na.rm = TRUE),
    fee1_full_0s = mean(fee1_full_0s, na.rm = TRUE),
    fee1_part_0s = mean(fee1_part_0s, na.rm = TRUE),
    fee1_none_0s = mean(fee1_none_0s, na.rm = TRUE),
    fee2_always_0s = mean(fee2_always_0s, na.rm = TRUE),
    fee2_sometimes_0s = mean(fee2_sometimes_0s, na.rm = TRUE),
    fee2_never_0s = mean(fee2_never_0s, na.rm = TRUE),
    recent_receipt_7 = mean(recent_receipt_7, na.rm = TRUE),
    recent_receipt_10 = mean(recent_receipt_10, na.rm = TRUE),
    recent_receipt_7_ofcl = mean(recent_receipt_7_ofcl, na.rm = TRUE),
    recent_receipt_7_v2a = mean(recent_receipt_7_v2a, na.rm = TRUE),
    recent_receipt_7_v2b = mean(recent_receipt_7_v2b, na.rm = TRUE),
    recent_receipt_10_ofcl = mean(recent_receipt_10_ofcl, na.rm = TRUE),
    recent_receipt_10_v2a = mean(recent_receipt_10_v2a, na.rm = TRUE),
    recent_receipt_10_v2b = mean(recent_receipt_10_v2b, na.rm = TRUE),
    receipt_shown = mean(receipt_shown, na.rm = TRUE)
  )

# Binding endline and baseline together ----------------------------------------
# Tag each wave so rows stay identifiable once stacked (1 = endline, 0 = baseline)
market_lvl_el$Endline <- 1
market_lvl_bl$Endline <- 0

# Row-bind the two waves. Columns that exist only in the endline summary
# (the *_0s variants) are filled with NA for the baseline rows.
market_lvl <- bind_rows(market_lvl_bl, market_lvl_el)

# Replace those NAs in the baseline rows with the regular baseline values:
# each mean and each count is copied into its *_0s counterpart.
# Baseline rows are selected through the Endline flag rather than a fixed
# 1:128 range, so the backfill stays correct if the number of baseline
# markets changes.
# NOTE(review): the sd_*_0s columns are not backfilled and stay NA for
# baseline rows - confirm that this is intended.
fee_stems <- c("fee1_full", "fee1_part", "fee1_none",
               "fee2_always", "fee2_sometimes", "fee2_never")
bl_source_cols <- c(fee_stems, paste0("c_", fee_stems))
bl_target_cols <- c(paste0(fee_stems, "_0s"), paste0("c_", fee_stems, "_0s"))
is_baseline <- market_lvl$Endline == 0

market_lvl[is_baseline, bl_target_cols] <- market_lvl[is_baseline, bl_source_cols]

# Attach the treatment assignment to every market row. A full join is used,
# so rows found on only one side of the join are kept as well.
market_lvl <- full_join(
  market_lvl,
  read_csv("data/2_clean/treatment_groups.csv"),
  by = c("market" = "Market")
)

# Build 0/1 treatment indicators from the treatment_status string:
#   BU_treat / TD_treat : status matches "BU" / "TD", or "BOTH"
#   BU, TD              : exactly one of the two flags is set
#   Both                : both flags are set
market_lvl <- market_lvl %>%
  mutate(
    BU_treat = as.numeric(str_detect(treatment_status, "BU|BOTH")),
    TD_treat = as.numeric(str_detect(treatment_status, "TD|BOTH")),
    BU = ifelse(BU_treat == 1 & TD_treat == 0, 1, 0),
    TD = ifelse(BU_treat == 0 & TD_treat == 1, 1, 0),
    Both = ifelse(BU_treat == 1 & TD_treat == 1, 1, 0)
  )

# Add the treatment-assignment block ids. Labelled columns in the Stata file
# are converted to factors before the join; rows are matched on market and
# district.
market_lvl <- market_lvl %>%
  left_join(
    mutate_if(read_dta("data/1_raw/block_ids.dta"), is.labelled, to_factor),
    by = c("market" = "market", "District" = "district")
  )

# Data Frames to Facilitate H1 Analysis ----------------------------------------
# For difference-in-means estimators of market-level tax compliance.
# Endline rows only. which() drops rows whose Endline flag is NA (rows the
# full join may have kept without any vendor data) instead of turning them
# into all-NA rows, which a bare logical index would do.
market_lvl_el <- market_lvl[which(market_lvl$Endline == 1), ]

# Baseline rows, plus the position of each endline market's baseline row.
# Pairing by market name (rather than by row position) keeps every difference
# within the same market even if the two waves differ in market set or order;
# an endline market with no baseline row gets an NA difference.
market_lvl_bl_rows <- market_lvl[which(market_lvl$Endline == 0), ]
bl_row_for_el <- match(market_lvl_el$market, market_lvl_bl_rows$market)

# Data frame holding endline-minus-baseline differences. Only the three
# outcomes below are replaced by differences; every other column keeps its
# endline value.
market_lvl_diffs <- market_lvl_el

# Difference for fee1_full
market_lvl_diffs$fee1_full <- market_lvl_el$fee1_full -
  market_lvl_bl_rows$fee1_full[bl_row_for_el]

# Difference for fee2_always
market_lvl_diffs$fee2_always <- market_lvl_el$fee2_always -
  market_lvl_bl_rows$fee2_always[bl_row_for_el]

# Difference for recent_receipt_7
market_lvl_diffs$recent_receipt_7 <- market_lvl_el$recent_receipt_7 -
  market_lvl_bl_rows$recent_receipt_7[bl_row_for_el]

# Save to disk -----------------------------------------------------------------
# Write the stacked market-level data and the endline-minus-baseline
# differences to disk, one object per .RData file.
save(list = "market_lvl", file = "data/3_formatted/market_lvl.RData")
save(list = "market_lvl_diffs", file = "data/3_formatted/market_lvl_diffs.RData")
